{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on these posts: \n",
    "* https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0\n",
    "* https://github.com/keisukeirie/Amazon_review_helpfulness_prediction\n",
    "* https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/\n",
    "* https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q boto3\n",
    "!pip install -q scikit-learn==0.20.3\n",
    "!pip install -q nltk==3.4.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create Scikit-Learn Pipeline with SGDClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "\n",
    "pipeline = Pipeline([\n",
    "    ('features', FeatureUnion([\n",
    "        ('body', Pipeline([\n",
    "            ('body_text_selector', TextSelector('review_body')),\n",
    "            ('tfidf_vectorizer', TfidfVectorizer(tokenizer=Tokenizer, stop_words=\"english\",\n",
    "                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),\n",
    "            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB\n",
    "        ])),\n",
    "    ])),\n",
    "    ('classifier', SGDClassifier(learning_rate='optimal'))\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "pipeline.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Predict and calculate metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\n",
    "\n",
    "test_preds = pipeline.predict(X_test)\n",
    "\n",
    "print('Accuracy: ', accuracy_score(y_test, test_preds))\n",
    "print('Precision: ', precision_score(y_test, test_preds, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, test_preds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cm = confusion_matrix(y_test, test_preds)\n",
    "df_cm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO:  Explain the classifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "#plt.figure(figsize = (10,7))\n",
    "sn.set(font_scale=1.4) # for label size\n",
    "sn.heatmap(df_cm, annot=True, annot_kws={\"size\": 16}) # font size\n",
    "\n",
    "# TODO:  Balance the dataset before training\n",
    "# TODO:  Add labels to each quadrant (predicted y-axis, actual x-axis)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
